Load and join data

Load libraries

library(tidyverse)

Load data

Loading raw data

## function_file:
curl::curl_download(url = "https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Creviewed%2Cid%2Cgene_names%2Corganism_name%2Cprotein_name%2Ccc_cofactor%2Ccc_catalytic_activity%2Cph_dependence%2Ccc_pathway%2Ccc_function%2Cec%2Ctemp_dependence%2Cgo_f%2Ccc_subcellular_location%2Cft_transmem%2Cft_intramem%2Cprotein_families&format=tsv&query=%28%28taxonomy_id%3A9606%29+OR+%28taxonomy_id%3A10090%29%29+AND+%28reviewed%3Atrue%29",
                    destfile = "../data/_raw/function_file.tsv.gz")

## sequence_file:
curl::curl_download(url = "https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Creviewed%2Cid%2Cgene_names%2Corganism_name%2Cprotein_name%2Clength%2Cmass%2Csequence%2Cft_act_site%2Cft_binding&format=tsv&query=%28%28taxonomy_id%3A9606%29+OR+%28taxonomy_id%3A10090%29%29+AND+%28reviewed%3Atrue%29",
                    destfile = "../data/_raw/sequence_file.tsv.gz")

Reading raw data

data_function <- read_delim(file = "../data/_raw/function_file.tsv.gz", delim = "\t")
data_sequence <- read_delim(file = "../data/_raw/sequence_file.tsv.gz", delim = "\t")

Join data

# joining data
joined_data <- data_function |>
  select(Entry, setdiff(colnames(data_function), colnames(data_sequence))) |>
  full_join(data_sequence,
            by = "Entry")

Save data

write_tsv(joined_data, file = "../data/01_dat_load.tsv")

Clean and join data

Load libraries

library(tidyverse)

Read data

# Reading data
joined_data <- read_delim(file = "../data/01_dat_load.tsv",
                          delim = "\t")

Cleaning part 1: Active site and binding site

# Changing space in column names to "_" and lower case letters
colnames(joined_data) <- str_replace_all(colnames(joined_data),
                                         " ",
                                         "_")
colnames(joined_data) <- str_to_lower(colnames(joined_data))

#####
# Drop all unwanted cols
#####
joined_data <- joined_data |>
  select(-reviewed,-`function_[cc]`,
         -temperature_dependence,
         -protein_names,
         -intramembrane,
         -catalytic_activity,
         -entry_name,
         -transmembrane)

####
# Tidying "active site" column
####
joined_data <- joined_data |>
  mutate(active_site = str_extract_all(active_site,
                                       "(?<=ACT_SITE )\\d+")) |>
  mutate(active_site = map_chr(active_site, ~paste(.x,
                                                   collapse = ","))) |>
  separate_longer_delim(col = active_site, ",")


####
# Tidying "binding site" column
####
joined_data <- joined_data |>
  mutate(binding_site_pos = str_extract_all(binding_site,
                                                   "(?<=BINDING\\s)\\d+(\\.\\.\\d+)?"),
         binding_site_ligand = str_extract_all(binding_site,
                                                   "(?<=/ligand=\"\")[^\"]+(?=\")")) |> 
  mutate(binding_site_pos = map_chr(binding_site_pos,
                                    ~paste(.x, collapse = ";")),
         binding_site_ligand = map_chr(binding_site_ligand,
                                       ~paste(.x, collapse = ";"))) |>
  filter(str_count(binding_site_pos, ";") == str_count(binding_site_ligand, ";")) |> 
  separate_longer_delim(col = c(binding_site_pos,
                                binding_site_ligand),
                        ";") |> 
  select(-binding_site)

# Converting to proper NA values
joined_data <- joined_data |> 
  mutate(binding_site_pos = na_if(binding_site_pos, "NA")) |> 
  mutate(binding_site_ligand = na_if(binding_site_ligand, "NA")) |> 
  mutate(active_site = na_if(active_site, "NA"))

Cleaning part 2: Cleaning the remaining columns

###
# Tidy "pathway" column
###
joined_data <- joined_data |>
  # Remove unwanted patterns (step is removed since it is a small minority that has this noted)
  mutate(pathway = str_remove_all(pathway, "^PATHWAY: "),
         pathway = str_remove_all(pathway, "pathway: "),
         pathway = str_remove_all(pathway, "\\{.*?\\}"),
         pathway = str_remove_all(pathway, ": step \\d+/\\d+"),
         pathway = str_remove_all(pathway, "\\."),
         pathway = str_remove(pathway, "\\s$")) |>
  # Split into longer form
  separate_longer_delim(col = pathway, delim = "; ") |> 
  mutate(pathway = tolower(pathway))

###
# Tidy "subcellular_location_[cc]" column
###
joined_data <- joined_data |>
  # Remove header and Notes
  mutate(`subcellular_location_[cc]` = str_remove(`subcellular_location_[cc]`, "Note=.*"),
         `subcellular_location_[cc]` = str_remove_all(`subcellular_location_[cc]`, "SUBCELLULAR LOCATION: "),
         `subcellular_location_[cc]` = str_remove_all(`subcellular_location_[cc]`, "\\{.*?\\}"),
         `subcellular_location_[cc]` = str_remove_all(`subcellular_location_[cc]`, "\\[.*?\\]"),
         `subcellular_location_[cc]` = str_remove_all(`subcellular_location_[cc]`, "\\.\\s*$")) |>
  # Use delimiter to make data longer (there are several for cofactors)
  separate_longer_delim(col = `subcellular_location_[cc]`, delim = "; ") |>
  separate_longer_delim(col = `subcellular_location_[cc]`, delim = ". ") |>
  separate_longer_delim(col = `subcellular_location_[cc]`, delim = ", ") |>
  # Remove '.' ':' and whitespace in beginning and end of strings
  mutate(`subcellular_location_[cc]` = str_remove(`subcellular_location_[cc]`, "^[. ]+|[. ]+$"),
         `subcellular_location_[cc]` = str_remove(`subcellular_location_[cc]`, "\\: ")) |> 
  # Convert to lower case
  mutate(`subcellular_location_[cc]` = tolower(`subcellular_location_[cc]`)) |> 
  # Removes space at the end
  mutate(`subcellular_location_[cc]` = str_remove(`subcellular_location_[cc]`, "\\s$")) 

####
# Tidy "cofactor" column
####
joined_data <- joined_data |>
  # keep only cofactor names in column
  mutate(cofactor = str_extract_all(cofactor, "(?<=Name=)[^;]+")) |> 
  mutate(cofactor = map_chr(cofactor, ~paste(.x, collapse = "_"))) |> 
  separate_longer_delim(col = cofactor, "_") |> 
  mutate(cofactor = na_if(cofactor, "NA")) |> 
  mutate(cofactor = tolower(cofactor),
         cofactor = str_remove(cofactor, "\\s$"))

####
# Tidy "protein_families" column
####
joined_data <- joined_data |>
  # Use delimiter to make data longer
  separate_longer_delim(protein_families, delim = ", ")|>
  # Extract family type and make new column to put data in + remove from colum
  mutate(family_type = str_extract(protein_families, 
                                    pattern = "\\S*family\\b", group = NULL),
         protein_families = str_remove(protein_families, 
                                         pattern = "\\S*family\\b")) |>
  separate_longer_delim(protein_families, delim = "; ") |>
  # Rename column as there is only one protein family in each row now
  rename(protein_family = protein_families) |> 
  relocate(protein_family, .before = family_type)
  

####
#Tidy "gene_ontology_(molecular_function)" column
####
joined_data <- joined_data |>
  # Give column a better name
  rename(GO_molecular_function = `gene_ontology_(molecular_function)`) |> 
  # Remove GO numbers
  mutate(GO_molecular_function = str_remove_all(GO_molecular_function, "\\[.*?\\]")) |> 
  # Use delimiter to make data longer
  separate_longer_delim(GO_molecular_function, delim = " ; ") |> 
  # Removes space at the end
  mutate(GO_molecular_function = str_remove(GO_molecular_function, "\\s$")) 

####
# Tidy "pH dependence" column 
####
# Extracting the optimum pH 
pH_str_search <- "Optimum pH is (\\d+\\.\\d*)-(\\d+\\.\\d*)|Optimum pH is (\\d+\\.\\d*)|Optimum pH is around (\\d+\\.\\d*)|Optimum pH is about (\\d+\\.\\d*)|Optimum pH is (\\d+\\.\\d*) to (\\d+\\.\\d*)|Optimum pH is between (\\d+\\.\\d*) and (\\d+\\.\\d*)"

#Function for creating two new columns for pH interval values
pH_columns <- str_c("pH_", seq(from = 1, to = 2))

# Starting to clean ph_dependence
joined_data <- joined_data |>
  #Removes text and keeps optimum pH values
  mutate(ph_dependence = str_extract(ph_dependence, pH_str_search),
         ph_dependence = str_remove(ph_dependence, "Optimum pH is"),
         ph_dependence = str_remove(ph_dependence, "about|around|between"),
         ph_dependence = str_replace(ph_dependence, " and ", "-")) |>
  #Creates a new variable with pH interval
  mutate(pH_interval = str_extract(ph_dependence, "(\\d+\\.\\d*)-(\\d+\\.\\d*)")) |>
  #Separates pH interval into columns
  separate(pH_interval, into = pH_columns, sep = "-") |>
  #Inserts optimum pH values from pH dependence if it is not and interval
  mutate(pH_1 = coalesce(pH_1, ph_dependence),
         pH_2 = coalesce(pH_2, ph_dependence)) |>
  #Changes chr to dbl and calculates average pH optimum
  mutate(pH_1 = as.numeric(pH_1),
         pH_2 = as.numeric(pH_2),
         ph_dependence = (pH_1+pH_2)/2) |>
  select(-pH_1,-pH_2)

###
# Tidy "gene name" column (select the first one)
###
joined_data <- joined_data |>
  # Only keep the first gene name
  mutate(gene_name = str_extract(gene_names, "^\\S*")) |>
  # Remove gene names column
  select(-gene_names) |> 
  # Remove any duplicate entries
  distinct()

###
# Tidy "organism" column
###
joined_data <- joined_data |>
  # Check that there are only Homo sapiens (Human) and Mus musculus (Mouse)
  filter(organism == "Homo sapiens (Human)" | organism == "Mus musculus (Mouse)") |>
  # Shorten organism name
  mutate(organism = case_when(
    organism == "Homo sapiens (Human)" ~ "Human",
    organism == "Mus musculus (Mouse)" ~ "Mouse"))

Save cleaned data object

write_tsv(joined_data, file = "../data/02_dat_clean.tsv")

Augmentation of data

Load librares

library(tidyverse)

Load clean data

clean_data <- read_delim(file = "../data/02_dat_clean.tsv", delim = "\t")
Rows: 2287129 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (13): entry, cofactor, pathway, ec_number, GO_molecular_function, subcel...
dbl  (4): ph_dependence, length, mass, active_site

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Augment data

Add column with peptide sequence around active site

# Function add column with sequence around active site
get_logo_seq <- function(x, y) {
  y <- as.integer(y)
  logo_seq_length <- 7
  n_aa_per_side <- (logo_seq_length-1/2)
  return(substr(x, y-3, y+3))
}

# Extracting peptides around active site and saving in new column
aug_data <- clean_data |>
  mutate(logo_seq = get_logo_seq(sequence, active_site)) |>
  # Filtereing out k-mers not covering 7 AA's
  mutate(logo_seq = if_else(nchar(logo_seq) != 7, NA, logo_seq))

Add count for the number of molecular functions for each protein

####
# Count the number of molecular functions for each entry
####
molecular_function_count <- clean_data |>
  select(organism, entry, GO_molecular_function, length, mass) |>
  distinct() |>
  drop_na() |>
  count(entry, name = "num_molecular_functions") 

# Join num_molecular_functions
aug_data <- aug_data |>
  full_join(molecular_function_count, by = "entry")

Add number of cofactors that binds to each protein

####
# Count the number of cofactors
####
cofactor_count <- clean_data |>
  select(organism, entry, cofactor, length, mass) |>
  distinct() |>
  drop_na() |>
  count(entry, name = "num_cofactors") 

# Join num_molecular_functions
aug_data <- aug_data |>
  full_join(cofactor_count, by = "entry")

Convert the first EC number into an enzyme class number

####
# Change "EC" column (we have decided only to work with the first digit but we are aware that the others have information aswell)
####
aug_data <- aug_data |>
  # Extract the first digit and create new column
  # For the few cases with more ec numbers we just look at the first
  mutate(first_digit = str_extract(ec_number, "^\\d"),
         # Translate digit to enzyme class using vector
         enzyme_class = case_when(
           first_digit == 1  ~ "oxidoreductases",
           first_digit == 2  ~ "transferases",
           first_digit == 3  ~ "hydrolases",
           first_digit == 4  ~ "lyases",
           first_digit == 5  ~ "isomerases",
           first_digit == 6  ~ "ligases",
           first_digit == 7  ~ "translocases",
         )) |> 
  select(-first_digit, -ec_number)

Save augment data object

write_tsv(aug_data, file = "../data/03_dat_aug.tsv")

Protein size analysis

Load augmented data

library(tidyverse)
# Load data
data <- read_delim(file = "../data/03_dat_aug.tsv", delim = "\t")

Plot number of molecular functions vs length

We wanted to see if proteins with more molecular functions were larger

#####
# Plot counted number of molecular functions vs length 
#####
data |>
  select(organism,
         entry,
         length,
         num_molecular_functions) |>
  distinct() |>
  drop_na() |>
  ggplot(aes(x = length,
             y = num_molecular_functions,
             fill = organism,
             colour = organism)) + 
  geom_point(alpha = 0.5,
             size = 0.75)+
  labs(title = "Length vs. Molecular Function Count",
       x = "Length",
       y = "Number of Molecular Functions") +
  theme_bw()

ggsave("05_size_analysis_mol_func.png",
       path = "../results")
Saving 7 x 5 in image

Plot boxplot of cofactor vs length

We wanted to see if proteins that had more cofactors were generally larger. We chose to do this as a boxplot because the maximum number of cofactors was 5 and it was therefore a visually manageable plot that eased visualization substantially. it did however require that the number of cofactors was treated as a factor.

#####
# Plot counted number of cofactors functions vs length 
#####
data |>
    select(organism,
           entry,
           length,
           num_cofactors)|>
    distinct()|>
    drop_na()|>
    # num_cofactors is converted to factor to enable use of boxplot
    ggplot(aes(y = length,
               fill = factor(num_cofactors),
               colour = factor(num_cofactors))) + 
    geom_boxplot(alpha = 0.5,
                 size = 0.75)+
    labs(title = "Boxplot of length by number of binding cofactors",
         y = "Length",
         fill = "Number of cofactors",
         color = "Number of cofactors") +
    facet_wrap(~organism)+
    # Extreme outliers are ignored to improve visualization
    scale_y_continuous(limits = c(0, quantile(data$length, 0.98))) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).

ggsave("05_size_analysis_cofactor_size.png",
       path = "../results")
Saving 7 x 5 in image
Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).

Plot size vs mass in kDa

We were wondering if humans used more large side chains than mice. To investigate this we plotted length vs mass and performed linear regression. We made one linear model for each organism and plotted the 95% confidence interval as to enable easy visualization to check if they were significantly different from each other. Should this be the case it would indicate that the two species do in fact have differences in the size of their amino acid side chains.

#####
# Plot size vs mass (fix the structure to use pipes)
#####
data|>
  select(organism,
       entry,
       length,
       mass)|>
  distinct()|>
  drop_na()|>
  # Create scatter plot with linear regression lines
  ggplot(aes(x = mass,
                 y = length,
                 color = organism)) +
    geom_point(alpha = 0.25,
               size = 0.75) +
    # Add linear regression line for human
    geom_smooth(data = data |>
                  filter(organism == "Human"),
                  method = "lm",
                  se = TRUE,
                  formula = y ~ x,
                  linetype = "solid",
                  colour = "red")+
    # Add linear regression line Mouse
    geom_smooth(data = data |>
                  filter(organism == "Mouse"),
                  method = "lm",
                  se = TRUE,
                  formula = y ~ x,
                  linetype = "dashed",
                  colour = "blue")+
    labs(title = "Scatter Plot of Length vs. Mass with Linear Regressions",
         x = "Mass (kDa)",
           y = "Length",
           color = "Organism") +
    # Extreme outliers are ignored to improove visualization
    scale_y_continuous(limits = c(0, quantile(data$length, 0.99))) +
    scale_x_continuous(labels = function(x) format(x, scientific = FALSE),
                       limits = c(0, quantile(data$mass, 0.99)))+
    theme_bw()
Warning: Removed 10799 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 12163 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 111 rows containing missing values (`geom_point()`).
Warning: Removed 2 rows containing missing values (`geom_smooth()`).
Warning: Removed 3 rows containing missing values (`geom_smooth()`).

ggsave("05_size_analysis_lm_size_length.png",
      path = "../results")
Saving 7 x 5 in image
Warning: Removed 10799 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 12163 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 111 rows containing missing values (`geom_point()`).
Warning: Removed 2 rows containing missing values (`geom_smooth()`).
Warning: Removed 3 rows containing missing values (`geom_smooth()`).
linear_models <- lm(length ~ mass + organism,
                    data = data|>
                              select(organism,
                                   entry,
                                   length,
                                   mass)|>
                              distinct()|>
                              drop_na())

pH Length investigation

The idea was to see if there was any connection between protein size and the optimal pH for the protein. The datapoints are coloured by organism to see if there are any clear differences between mice and humans.

####
# Assemble size-pH
####
data|>
  select(organism,
         entry,
         length,
         mass,
         ph_dependence)|>
  distinct()|>
  drop_na()|>
  # 368 distinct entries
  # Plot a scatter plot of the 
  ggplot(aes(x= length,
             y = ph_dependence,
             fill = organism,
             colour = organism)) + 
  geom_point(alpha = 0.75,
             size = 0.75)+
  labs(title = "Scatter Plot of length vs. pH",
       x = "Sequence length",
       y = "pH optimum")

ggsave("05_size_analysis_pH_length.png",
       path = "../results")
Saving 7 x 5 in image

pH enzyme class comparison (needs to be moved)

We decided to test if enzyme classes had varying pH dependencies between humans and mice. We therefore made a boxplot of the pH dependencies on all enzyme classes and stratified on the organism to compare the two.

# Needs to be moved
####
# pH by enzyme class test 
####
data|>
  # Filter remoove unwanted columns and rows from data
  select(organism,
         entry,
         length,
         mass,
         ph_dependence,
         enzyme_class) |>
  distinct() |>
  drop_na() |>
  # Plot the results  
  ggplot(aes(y = ph_dependence, 
             fill = enzyme_class, 
             colour = enzyme_class)) +
  geom_boxplot(alpha = 0.75, size = 0.9) +
  labs(color = "Enzyme class",
       fill = "Enzyme class",
       y = "pH optimum") +
  facet_wrap(~organism)

# 310 distinct entries
data|>
  select(organism,
         entry,
         ph_dependence,
         enzyme_class)|>
  filter(organism == "Mouse",
         enzyme_class == "ligases")|>
  drop_na()|>
  distinct(entry)
# A tibble: 5 × 1
  entry 
  <chr> 
1 A4Q9F0
2 P28650
3 P46664
4 P48760
5 Q3THK7
# Only 5 ligases in mice which is not enough to determine anything 

Plotting Length vs enzyme class

The idea was to see if mice and human proteins varied in size when looking at specific enzyme classes.

####
# Size by enzyme class test 
####
data|>
    select(organism,
           entry,
           length,
           mass,
           enzyme_class) |>
    distinct() |>
    drop_na() |>
    ggplot(aes(y = length,
               fill = enzyme_class,
               colour = enzyme_class)) +
    geom_boxplot(alpha = 0.75, size = 0.9) +
    labs(color = "Enzyme class",
       fill = "Enzyme class",
       y = "Length") + 
    facet_wrap(~organism) +
    scale_y_continuous(limits = c(0, quantile(data$length, 0.98))) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
Warning: Removed 92 rows containing non-finite values (`stat_boxplot()`).

# 310 distinct entries

ggsave("05_size_analysis_class_size.png",
       path = "../results")
Saving 7 x 5 in image
Warning: Removed 92 rows containing non-finite values (`stat_boxplot()`).

Plotting size by subcellular loaction

The idea was to see if protein sizes varied in subcellular locations and to check if there were any differences between mice and humans.

####
# Size by subcellular location 
####

# Plot boxplots for protein sizes in three subcellular locations
data |>
  filter(`subcellular_location_[cc]` == c("cell membrane", "cytoplasm","nucleus")) |>
  ggplot(aes(y = length,
             fill = `subcellular_location_[cc]`,
             colour = `subcellular_location_[cc]`)) +
  geom_boxplot(alpha = 0.75, size = 0.9) +
  facet_wrap(~organism) +
  scale_y_continuous(limits = c(0, quantile(data$length, 0.99))) +
  labs(title = "Boxplot of protein lengths in three subcellular locations",
       y = "Protein Length",
       fill = "Subcellular location",
       color = "Subcellular location") +
  theme(legend.position = "bottom")
Warning: There was 1 warning in `filter()`.
ℹ In argument: `==...`.
Caused by warning in `` `subcellular_location_[cc]` == c("cell membrane", "cytoplasm", "nucleus") ``:
! longer object length is not a multiple of shorter object length
Warning: Removed 2392 rows containing non-finite values (`stat_boxplot()`).

ggsave("05_size_analysis_location_size.png",
       path = "../results")
Saving 7 x 5 in image
Warning: Removed 2392 rows containing non-finite values (`stat_boxplot()`).

Decription of the data used in this project

Loading libraries

## Loading libraries
library(tidyverse)
library(knitr)
library(patchwork)

Loading data

Introduction

We have downloaded the data of all the proteins of mouse and human in the Swiss-Prot database. The Swiss-Prot database is a freely accessible database of manually annotated and non-redundant proteins. The annotations are regularly reviewed to include potentially new findings regarding a specific protein. The same gene from the same species only get one entry in the database to avoid redundancy. All this information about the database makes the data trustworthy which is important for us. The goal is to examine the data to see if it holds anything interesting on a large-scale analysis where we use the general categories. We have decided to focus on mouse and human. We expect the two organisms to show high similarity as they are both mammals and the research of proteins in human and mouse often include the use of ortolog comparison. However, we dug into the data to see if we despite high similarity could extract some differences. Or are we just very much like the mouse? The data was fun to work with as it was untidy and required a lot of cleaning before we could start with the analysis. It was a high motivation for us to practice the data wrangling part as much as possible as we saw this part as the most challenging content in the course. The following part will show some general descriptive statistics we made to get an overview of the data, we ended up using for our project.

Descriptive statistics

The data includes information about the function of each of the 36990 proteins and the sequence and much more. For a start, we merge the two raw data files (function and sequence) into one. From this point, we started excluding variables which we did not want to continue with. We excluded the variables reviewed, function, temperature_dependence, protein_names, intramembrane, catalytic_activity, entry_name, and transmembrane. These columns were not informative as some of them were just long text descriptions, (e.g., function), reaction equations (e.g., catalytic_activity), redundant (entry_name), or useless. After some data wrangling (which can be seen in “02_clean.qmd”), we ended up with the following variables: The data includes information about the function, the sequence, and much more of each of the 36990 proteins. For a start, we merged the two raw data files (function and sequence) into one. From this point, we started excluding variables that we did not want to keep. We excluded the variables “reviewed”, “function”, “temperature_dependence”, “protein_names”, “intramembrane”, “catalytic_activity”, “entry_name”, and “transmembrane”. These columns were not informative as some of them were just long text descriptions, (e.g., “function”), reaction equations (e.g., “catalytic_activity”), redundant (“entry_name”), or useless. After some data wrangling (which can be seen in “02_clean.qmd”), we ended up with the following variables:

The different variables (columns) in the dataset:

# Extracting the column names
data |> 
  select(-entry, -gene_name) |> 
  colnames()
 [1] "cofactor"                  "ph_dependence"            
 [3] "pathway"                   "GO_molecular_function"    
 [5] "subcellular_location_[cc]" "organism"                 
 [7] "length"                    "mass"                     
 [9] "sequence"                  "active_site"              
[11] "binding_site_pos"          "binding_site_ligand"      
[13] "protein_family"            "family_type"              
[15] "logo_seq"                  "num_molecular_functions"  
[17] "num_cofactors"             "enzyme_class"             

Originally the dataset also contained entries from other mouse-species and from neanderthal. These have been removed. The proteins are described with 15 variables (including the species):

  • pathway:
    • e.g., purine metabolism, protein modification, protein ubiquitination
  • cofactor:
    • e.g., heme, mg(2+), zn(2+)
  • ph_dependence
  • subcellular_location_[cc]
    • e.g., cytoplasm, nucleus, cell membrane
  • GO_molecular_function
    • e.g., metal ion binding, ATP binding, G protein-coupled receptor activity
  • organism:
    • Mouse or Human
  • length
    • The lenght of the amino acid sequence
  • mass
    • Mass of the protein in kilo dalton [kDa]
  • sequence
    • The amino acid sequence
  • active_site
    • Position of the active site
  • binding_site_pos
    • Position of the binding site
  • binding_site_ligand
    • e.g., Ca(2+), heme, ATP
  • protein_family
    • e.g., Protein kinase, G-protein coupled receptor 1
  • family_type
    • family / subfamily
  • enzyme_class
    • e.g., transferases, hydrolases

Protein amount for each organism

We do not see a great difference in the amount of proteins described for mouse and human. The following plot visualizes how similar the count is.

## This plot shows the total count of human proteins in the database vs. mouse proteins in the database
data |>
  ggplot(mapping = aes(x=organism,
             fill = organism)) +
  geom_bar() +
  theme_bw() +
  theme(legend.position = "none",
        axis.title.y = element_text(angle = 0, 
                                    vjust = 0.5, 
                                    size = 10),
        axis.text.x = element_text(angle = 0, 
                                   size = 10)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Total count of human and mouse",
       y = "Count",
       x = "Organism")

## Saving the plot in results
ggsave("04_descibe_plot1_totalcount.png",
       path = "../results")
Saving 7 x 5 in image

The low difference in amount of proteins for mouse and human respectively is illustrating the low difference between mouse and human in general. The following plots shows the difference in length (first plot) and mass (second plot) for mouse and human. It is clear that the density plots for mouse and human respectively are very similar. This indicates that the proteins described in human has an ortholog in mouse which explains the low difference in length and mass.

Density plot for length of the proteins in the database stratified on mouse and human

## Density plot for the length of the proteins stratified on mouse and human
data |>
  ggplot(aes(x = length, 
             fill = organism)) +
  xlim(0, 2500) +
  geom_density(alpha = 0.5) +
  theme_bw() +
  theme(axis.title.y = element_text(angle = 0, 
                                    vjust = 0.5, 
                                    size = 11),
        axis.text.x = element_text(angle = 40, 
                                   hjust = 1, 
                                   size = 11)) +
  labs(title = "Density plot of the length of the proteins in the database",
       y = "Density",
       x = "Length",
       fill = "Organism")

## Saving the plot in results
ggsave("04_descibe_plot2_density_length.png",
       path = "../results")

Density plot for the mass of the proteins in the database stratified on mouse and human

## Density plot for the mass of the proteins stratified on mouse and human
data |>
  ggplot(aes(x = mass, 
             fill = organism)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(
    labels = function(x) format(x, scientific = FALSE),
    limits = c(0, 300000)
  ) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme_bw() +
  theme(axis.title.y = element_text(angle = 0, 
                                    vjust = 0.5, 
                                    size = 11),
        axis.text.x = element_text(angle = 40, 
                                   hjust = 1, 
                                   size = 11)) +
  labs(title = "Density plot of the mass of the proteins in the database",
       y = "Density",
       x = "Mass",
       fill = "Organism")

## Saving the plot in results
ggsave("04_descibe_plot3_density_mass.png",
       path = "../results")
## Making a patchwork of the two density plots above
## Density plot for the length
length_density_plot <- data |>
  ggplot(aes(x = length, 
             fill = organism)) +
  xlim(0, 2500) +
  geom_density(alpha = 0.5) +
  theme_bw() +
  theme(axis.title.y = element_text(angle = 0, 
                                    vjust = 0.5, 
                                    size = 11),
        axis.text.x = element_text(angle = 40, 
                                   hjust = 1, 
                                   size = 11)) +
  labs(y = "Density",
       x = "Length") +
  guides(fill = FALSE)
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
## Density plot for the mass
mass_density_plot <- data |>
  ggplot(aes(x = mass, 
             fill = organism)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(
    labels = function(x) format(x, scientific = FALSE),
    limits = c(0, 300000)
  ) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, 
                                   hjust = 1, 
                                   size = 11),
        axis.title.y = element_blank()) +
  labs(x = "Mass",
       fill = "Organism")

## Combine the plots
combined_length_mass <- length_density_plot + 
  mass_density_plot

## Print the combined plots
print(combined_length_mass)
Warning: Removed 55729 rows containing non-finite values (`stat_density()`).
Warning: Removed 32649 rows containing non-finite values (`stat_density()`).

## Saving the plot in results
ggsave("04_descibe_plot2and3_density_length_and_mass.png",
       path = "../results")
Saving 7 x 5 in image
Warning: Removed 55729 rows containing non-finite values (`stat_density()`).
Removed 32649 rows containing non-finite values (`stat_density()`).

Unique entries of each variable

The variables that we kept have a different count of unique entries, e.g., there are 7 different entries for “enzyme_class” and 538 different “GO_molecular_function”. The following table counts the number of unique entries of each variable.

## Table of the unique entries of each variable

summary_table <- data |>
  select(
    cofactor, 
    `subcellular_location_[cc]`, 
    GO_molecular_function, 
    protein_family, 
    enzyme_class, 
    pathway) |>
  drop_na() |>
  pivot_longer(
    cols = c(
      cofactor, 
      `subcellular_location_[cc]`, 
      GO_molecular_function, 
      protein_family, 
      enzyme_class, 
      pathway
    ),
    names_to = "Column",
    values_to = "Value"
  ) |>
  group_by(Column) |>
  summarise(Unique_Entries = n_distinct(Value))

# Using kable() to make the table
kable(summary_table)
Column Unique_Entries
GO_molecular_function 576
cofactor 31
enzyme_class 7
pathway 246
protein_family 179
subcellular_location_[cc] 91

To further improve the overview of the unique entries, we have made a bar plot showing the same as the table:

Illustrated with a bar plot as well

## Bar plot of the unique entries for each variable

data |> 
  select(cofactor, 
         `subcellular_location_[cc]`, 
         GO_molecular_function, 
         protein_family, 
         enzyme_class, 
         pathway) |> 
   drop_na()|> 
   pivot_longer(cols = c(cofactor,
                        `subcellular_location_[cc]`,
                        GO_molecular_function, protein_family,
                        enzyme_class, pathway),
               names_to = "Column", 
               values_to = "Value") |> 
  group_by(Column) |>
  summarise(Unique_Entries = n_distinct(Value)) |> 
  ggplot(aes(x=Column, 
             y=Unique_Entries, 
             fill = Column)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(
    axis.title.y = element_text(angle = 0, 
                                vjust = 0.5, 
                                size = 11),
    axis.text.x = element_text(angle = 40, 
                               hjust = 1, 
                               size = 11),
        ) +
  guides(fill = FALSE) +
  labs(title = "Amounts of unique entries for the different variables",
       x = "Variable",
       y = "Unique entries")

## Saving the plot in results
ggsave("04_descibe_plot4_unique_entries.png",
       path = "../results")
Saving 7 x 5 in image

The unique entries of the variable “enzyme_class”

Now we dig into a specific variable to give an example of how the variables look like. The “enzyme_class” contains seven unique entries. The amount of each entry is shown in the plot below:

data |>
  select(enzyme_class) |>
  drop_na() |> 
  group_by(enzyme_class) |>
  summarise(Amount = n()) |>
  ggplot(aes(x=enzyme_class, 
             y=Amount, 
             fill=enzyme_class)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(
    axis.title.y = element_text(angle = 0, 
                                vjust = 0.5, 
                                size = 11),
    axis.text.x = element_text(angle = 40, 
                               hjust = 1, 
                               size = 11),
        ) +
  guides(fill = FALSE) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Amount of different enzyme classes",
       y = "Amount",
       x = "Enzyme class")

In this project, we have primarilt tried to examine a difference between mouse and human. The following plot is an example of how we have dug into the data with the intention to visualized the difference between mouse and human in different categories.

## Amount of different enzyme classes in Human
enzymes_Human <- data |>
  select(enzyme_class, organism) |>
  drop_na() |> 
  filter(organism == "Human") |> 
  group_by(enzyme_class) |>
  summarise(Amount = n()) |>
  ggplot(aes(x=enzyme_class, 
             y=Amount, 
             fill=enzyme_class)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 40, 
                                   hjust = 1, 
                                   size = 10),
        plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Human") +
  guides(fill = FALSE)

## Amount of different enzyme classes in Mouse
enzyme_Mouse <-  data |>
  select(enzyme_class, 
         organism) |>
  drop_na() |> 
  filter(organism == "Mouse") |> 
  group_by(enzyme_class) |>
  summarise(Amount = n()) |>
  ggplot(aes(x=enzyme_class, 
             y=Amount, 
             fill=enzyme_class)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 40, 
                                   hjust = 1, 
                                   size = 10),
        plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Mouse") +
  guides(fill = FALSE)

## Combining the two plots with patchwork
p <- enzymes_Human + 
  enzyme_Mouse +
  plot_annotation("Amount of different enzyme classes in human and mouse",
                  theme=theme(plot.title=element_text(hjust=0.5)))
p

## Saving the plot in results
ggsave("04_descibe_plot5_different_enzyme_classes.png",
       path = "../results")
Saving 7 x 5 in image

Sequence logo plots for peptides around active site

In this analysis we will create and compare logo plots for human- and mouse-entries with available “active site” information in uniprot.

We do so first building sequence logo plots for different protein families. Later we zoom out and look at enzyme classes instead, as defined by the first digit of the EC number (see data augmentation steps).

Load librares

library(ggseqlogo)
library(patchwork)
library(tidyverse)
library(knitr)

Load data

aug_data <- read_delim(file = "../data/03_dat_aug.tsv",
                       delim = "\t")

Identify protein families to look at

The purpose of this step is to identify the protein families where most information is available.
The table in the output message will show the 10 most frequently observed protein families as an average across human and mice.

# Filter out if no active site info
logoseq_data <- aug_data |> 
  drop_na(logo_seq) |> 
  filter(family_type == "family") |> 
  select(entry,
         organism,
         protein_family,
         family_type,
         logo_seq)

# See what protein families with active site data occur most often in mouse
top_prot_fam_mouse <- logoseq_data |>
  drop_na(protein_family) |> 
  filter(str_detect(organism,
                    "Mouse")) |> 
  select(entry,
         protein_family,
         logo_seq) |> 
  distinct() |> 
  group_by(protein_family) |> 
  summarize(n_mouse = n()) |>
  arrange(desc(n_mouse))

# See what protein families with active site data occur most often in human
top_prot_fam_human <- logoseq_data |>
  drop_na(protein_family) |> 
  filter(organism == "Human") |>
  select(entry, protein_family, logo_seq) |> 
  distinct() |> 
  group_by(protein_family) |> 
  summarize(n_human = n()) |>
  arrange(desc(n_human))

# Combine mouse and human protein family counts and available active site information
top_prot_fam <- top_prot_fam_mouse |> 
  full_join(top_prot_fam_human,
            by = "protein_family") |> 
  mutate(n_average = (n_mouse + n_human) /2) |>
  arrange(desc(n_average))

top_prot_fam |> 
  slice(1:10)|> knitr::kable()
protein_family n_mouse n_human n_average
Peptidase S1 363 326 344.5
Peptidase C19 100 133 116.5
Protein-tyrosine phosphatase 92 97 94.5
Tyr protein kinase 88 86 87.0
CAMK Ser/Thr protein kinase 79 72 75.5
AGC Ser/Thr protein kinase 60 64 62.0
CMGC Ser/Thr protein kinase 61 62 61.5
Ser/Thr protein kinase 62 60 61.0
Lipase 54 63 58.5
STE Ser/Thr protein kinase 54 54 54.0
# Extract names of protein families in arranged order, to use for further analysis
top_prot_fam_name_vec <- top_prot_fam |> 
  pull(protein_family)

Logo plots for protein families

Here we use ggseqlogo() to visualize peptide sequences for each of the top 5 most frequently observed protein families as logo plots. The logo plots will be stitched together using both the inbuilt facet function of gggseqlogo() and patchwork.

The logo plot for mouse and human protein families are very similar, showing that mouse and human homologs very similar around the active sites. In addition, this analysis show how uniprot data can be utilized to gain knowledge about active sites for specific protein families.

### Protein family logo plots ###
# In this code chunk we work with the top 5 most frequently observed protein families

# Extract mouse data and filter out if no active site info
logoseq_data_mouse_prot <- aug_data |> 
  drop_na(logo_seq, protein_family) |>
  filter(protein_family %in% top_prot_fam_name_vec[1:5]) |> 
  filter(str_detect(organism, "Mouse")) |> 
  select(entry,
         organism,
         protein_family,
         logo_seq) |> 
  distinct()

# Extract human data and filter out if no active site info
logoseq_data_human_prot <- aug_data |> 
  drop_na(logo_seq, protein_family) |>
  filter(protein_family %in% top_prot_fam_name_vec[1:5]) |> 
  filter(str_detect(organism, "Human")) |> 
  select(entry,
         organism,
         protein_family,
         logo_seq) |> 
  distinct()

# Change proten family names to get line breaks (check if names match before changing them)
# Converting data to named list for use with facet function in ggseqlogo
# For mouse
logo_list_mouse_prot <- logoseq_data_mouse_prot |>
  select(logo_seq, protein_family) |>
  group_split(protein_family) |> 
  map(~ pull(.x, logo_seq)) |> 
  map(unique)  # Filter out identical sequences

names(logo_list_mouse_prot) <- c("CAMK Ser/Thr\nprotein kinase",
                                 "Peptidase C19 ",
                                 "Peptidase S1 ",
                                 "Protein-tyrosine\nphosphatase ",
                                 "Tyr protein\nkinase ")

# For human
logo_list_human_prot <- logoseq_data_human_prot |>
  select(logo_seq, protein_family) |>
  group_split(protein_family) |> 
  map(~ pull(.x, logo_seq)) |> 
  map(unique)  # Filter out identical sequences

names(logoseq_data_human_prot) <- c("CAMK Ser/Thr\nprotein kinase",
                                 "Peptidase C19 ",
                                 "Peptidase S1 ",
                                 "Protein-tyrosine\nphosphatase ",
                                 "Tyr protein\nkinase ")
Warning: The `value` argument of `names<-` must have the same length as `x` as of tibble
3.0.0.
# Making logo plot for mouse protein families
logo_plot_mouse_prot <- ggseqlogo(logo_list_mouse_prot,
                                  ncol=7) +
  ggtitle("Mouse protein families") +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 12),
        strip.text = element_text(size = 8)) +
  scale_x_continuous(breaks = 1:7,  # Existing tick positions
                     labels = -3:3  # Existing tick labels
  )

# Making logo plot for human protein families
logo_plot_human_prot <- ggseqlogo(logo_list_human_prot,
                                  ncol=7) +
  ggtitle("Human protein families") +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 12),
        strip.text = element_text(size = 8)) +
  scale_x_continuous(breaks = 1:7,  # Existing tick positions
                     labels = -3:3  # Existing tick labels
  )

# Combining mouse and human logo plot for protein families
logo_comb_prot <- logo_plot_human_prot / logo_plot_mouse_prot
logo_comb_prot

# Save combined logo plot as png
ggsave(filename = "../results/06_logoplot_prot_fam.png",
       logo_comb_prot)

Logo plots for enzyme class

Here we use ggseqlogo() to visualize peptide sequences for different types of enzyme classes according to the first digit in their EC number. We filtered out “translocases”, because very few entries very observed for this enzyme class resulting in an artifically high information content. The logo plots will be stitched together using both the inbuilt facet function of gggseqlogo() and patchwork.

Just as for the protein family logo plots, the logo plots for mouse and human enzyme classes are very similar. Enzyme classes are less specific than protein families, and thus we naturally observe lower information content in general. However we thought that “zooming out”, might reveal some differences between human and mice enzymes in general, but this was not the case.

### Enzyme class logo plots ###

# Extract mouse data and filter out if no active site info
logoseq_data_mouse_enz <- aug_data |>
  drop_na(logo_seq, enzyme_class) |>
  filter(str_detect(organism,
                    "Mouse")) |>
  # Removing translocases because limited data available
  filter(!enzyme_class == "translocases") |>
  select(entry,
         organism,
         enzyme_class,
         logo_seq) |>
  distinct()

# Extract human data and filter out if no active site info
logoseq_data_human_enz <- aug_data |>
  drop_na(logo_seq, enzyme_class) |>
  filter(str_detect(organism,
                    "Human")) |>
  # Removing translocases because limited data available
  filter(!enzyme_class == "translocases") |>
  select(entry,
         organism,
         enzyme_class,
         logo_seq) |>
  distinct()

# Converting data to named list for facet function in ggseqlogo: mouse data
logo_list_mouse_enz <- logoseq_data_mouse_enz |>
  select(logo_seq, enzyme_class) |>
  group_split(enzyme_class) |> 
  map(~ pull(.x, logo_seq)) |> 
  map(unique)  # Filter out identical sequences

names(logo_list_mouse_enz) <- c("Hydrolases",
                                 "Isomerases",
                                 "Ligases",
                                 "Lyases",
                                 "Oxidoreductases",
                                 "Transferases")

# Converting data to named list for facet function in ggseqlogo: human data
logo_list_human_enz <- logoseq_data_human_enz |>
  select(logo_seq, enzyme_class) |>
  group_split(enzyme_class) |> 
  map(~ pull(.x, logo_seq)) |> 
  map(unique)  # Filter out identical sequences

names(logo_list_human_enz) <- c("Hydrolases",
                                 "Isomerases",
                                 "Ligases",
                                 "Lyases",
                                 "Oxidoreductases",
                                 "Transferases")

# Making logo plots for mouse enzyme classses
logo_plot_mouse_enz <- ggseqlogo(logo_list_mouse_enz,
                                 ncol=6) +
  ggtitle("Mouse enzyme classes") +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 12),
        strip.text = element_text(size = 8)) +
  scale_x_continuous(breaks = 1:7,  # Existing tick positions
                     labels = -3:3  # Existing tick labels
  )

# Making logo plots for human enzyme classses
logo_plot_human_enz <- ggseqlogo(logo_list_human_enz,
                                 ncol=6) +
  ggtitle("Human enzyme classes") +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 12),
        strip.text = element_text(size = 8)) +
  scale_x_continuous(breaks = 1:7,  # Existing tick positions
                     labels = -3:3  # Existing tick labels
  )

# Combining mouse and human logo plots for enzyme classes
logo_comb_enz <- logo_plot_human_enz / logo_plot_mouse_enz
logo_comb_enz

# Save combined logo plot
ggsave(filename = "../results/06_logoplot_enz.png",
       logo_comb_enz)

Location Analysis

Load data

library(tidyverse)
data_clean <- read_delim(file = "../data/03_dat_aug.tsv", delim = "\t")

Analysis

We will look at the distribution of enzyme classes, molecular functions and pathways in the three most populated subcellular locations to see if they are differently distributed. These three locations are the cell membrane, the cytoplasm and the nucleus.

Enzyme class and subcellular location

There are 7 enzyme classes representing 7 types of enzyme catalyzed reactions. The distribution of enzyme classes is very similar between human and mice but there are some subtle differences between the three subcellular locations. There are more translocases in the cell membrane than in the other subcellular locations. The high relative number of translocases in the cell membrane makes great biological sense as they catalyze the transport of molecules and ions across membranes among other things.

data_clean |>
  drop_na(enzyme_class, `subcellular_location_[cc]`) |>
  select(entry, enzyme_class, organism, `subcellular_location_[cc]`) |>
  distinct() |>
  filter(`subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane")) |>
  ggplot(aes(x = `subcellular_location_[cc]`,
             color = enzyme_class,
             fill = enzyme_class)) +
  geom_bar(position = "fill") +
  labs(title = "Enzyme classes in subcellular locations",
       subtitle = "Distribution of the 7 enzyme classes in three subcellular locations",
       x = "Subcellular location",
       y = "Relative count",
       color = "Enzyme class",
       fill = "Enzyme class") +
  theme_bw() +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 8),
        legend.title = element_text(size = 10),
        plot.subtitle = element_text(size = 10)) +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE),
         color = guide_legend(nrow = 2, byrow = TRUE)) +
  facet_wrap(~organism)
Warning: There was 1 warning in `filter()`.
ℹ In argument: ``subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell
  membrane")`.
Caused by warning in `` `subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane") ``:
! longer object length is not a multiple of shorter object length

ggsave("07_location_analysis_EC.png",
       path = "../results")
Saving 7 x 5 in image

Molecular function in subcellular location

The molecular functions are very equally distributed in human and mice but there are differences between the three subcellular locations. There are many more DNA binding proteins in the nucleus where DNA is stored. There are more ATP binding proteins in the cell membrane and cytoplasm compared to the nucleus. There are also more identical protein binding proteins in the cell membrane i.e. it appears that homogeneous protein-protein interactions are more common in the cell membrane than in the two other subcellular locations.

# Vector with top 5 molecular functions
top5_GO_molecular_function <- data_clean |>
  drop_na(GO_molecular_function) |>
  select(entry, GO_molecular_function) |>
  distinct() |>
  group_by(GO_molecular_function) |>
  summarize(count = n()) |>
  arrange(desc(count)) |>
  pull(GO_molecular_function) |>
  head(5)
# Plot of molecular function vs subcellular location
data_clean |>
  drop_na(GO_molecular_function, `subcellular_location_[cc]`) |>
  select(entry, GO_molecular_function, organism, `subcellular_location_[cc]`) |>
  distinct() |>
  filter(`subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane"),
         GO_molecular_function == top5_GO_molecular_function) |>
  ggplot(aes(x = `subcellular_location_[cc]`,
             color = GO_molecular_function,
             fill = GO_molecular_function)) +
  geom_bar(position = "fill") +
  labs(title = "Molecular functions in subcellular locations",
       subtitle = "Distribution of the top 5 molecular functions in three subcellular locations",
       x = "Subcellular location",
       y = "Relative count",
       color = "Molecular function",
       fill = "Molecular function") +
  theme_bw() +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 7),
        legend.title = element_text(size = 10),
        plot.subtitle = element_text(size = 10)) +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE),
         color = guide_legend(nrow = 2, byrow = TRUE)) +
  facet_wrap(~organism)
Warning: There was 1 warning in `filter()`.
ℹ In argument: `GO_molecular_function == top5_GO_molecular_function`.
Caused by warning in `GO_molecular_function == top5_GO_molecular_function`:
! longer object length is not a multiple of shorter object length

ggsave("07_location_analysis_mol_func.png",
       path = "../results")
Saving 7 x 5 in image

Pathways in subcellular location

Compared to the molecular functions and enzyme classes, there is a greater difference between the pathways in human and mice. However, there is also less data about the pathways in the three subcellular locations which might be the reason behind this apparent difference between organisms. In human, proteins involved in amino acid biosynthesis and degradation are found in the cytoplasm but none of these are found in mice. Mice obviously also have amino acid metabolism which ought to take place in the cytoplasm like it does in human. Greater annotation of the proteins is needed to be able to compare the pathways in human and mice.

# Vector with top 10 pathways
top10_pathway <- data_clean |>
  drop_na(pathway) |>
  select(entry, pathway) |>
  distinct() |>
  group_by(pathway) |>
  summarize(count = n()) |>
  arrange(desc(count)) |>
  pull(pathway) |> 
  head(10)
# Plot of pathway vs subcellular location
data_clean |>
  drop_na(pathway, `subcellular_location_[cc]`) |>
  select(entry, pathway, organism, `subcellular_location_[cc]`) |>
  distinct() |>
  filter(`subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane"),
         pathway == top10_pathway) |>
  ggplot(aes(x = `subcellular_location_[cc]`,
             color = pathway,
             fill = pathway)) +
  geom_bar() +
  labs(title = "Pathways in subcellular locations",
       subtitle = "Distribution of the top 10 pathways in three subcellular locations",
       x = "Subcellular location",
       y = "Count",
       color = "Pathway",
       fill = "Pathway") +
  theme_bw() +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 5),
        legend.title = element_text(size = 9),
        plot.subtitle = element_text(size = 10)) +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE),
         color = guide_legend(nrow = 2, byrow = TRUE)) +
  facet_wrap(~organism)
Warning: There were 2 warnings in `filter()`.
The first warning was:
ℹ In argument: ``subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell
  membrane")`.
Caused by warning in `` `subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane") ``:
! longer object length is not a multiple of shorter object length
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

ggsave("07_location_analysis_pathways.png",
       path = "../results")
Saving 7 x 5 in image